优快云收藏备份Python脚本

最新推荐文章于 2024-05-31 03:00:00 发布

XNightSky

最新推荐文章于 2024-05-31 03:00:00 发布

阅读量566

点赞数 2

CC 4.0 BY-SA版权

分类专栏： Python

本文链接：https://blog.youkuaiyun.com/summer__xt/article/details/8790679

Python 专栏收录该内容

1 篇文章

订阅专栏

本文介绍了一个简单的Python2脚本，用于备份优快云空间中的收藏夹，并以此来验证Python2的学习成果。该脚本利用了urllib2和正则表达式(re)进行网页抓取和解析。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

备份一下优快云空间的收藏模块，顺便检测一下 python2 的学习成果，简单地使用 urllib2 和 re，轻拍。

#!/usr/bin/env python2
# -*- coding: utf-8 -*-
import re
import os
import time
import pdb

# Account whose favorites are backed up, and the base URL of its favorites
# listing; loop_favorite appends '/<page number>' to this URL.
FAVORITE_USER = 'summer__xt'
FAVORITE_ULR = 'http://my.youkuaiyun.com/%s/favorite' % FAVORITE_USER
# User-Agent header sent with every page request.
FAVORITE_HEAD__USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0;)'
# One indentation step used when building the XML output.
STR_TAB = ' ' * 4
# Shared scraping state: number of listing pages known so far, the 1-based
# index of the page being processed, and the accumulated favorite records.
g_favorite_page_count = g_favorite_page_index = 1
g_favorite_items = []

def loop_favorite():
    """Fetch every favorites page, collect its items, then save the backup.

    Starts at page 1 and keeps requesting ``FAVORITE_ULR/<index>`` while the
    index does not exceed ``g_favorite_page_count``.  The page count is
    updated from the value parse_favorite() returns; a result of 0 (no
    usable "last page" link) leaves the previously known count in place.
    Once the loop ends, the collected items are written by save_favorite().
    """
    # Both counters are module-level state.  parse_favorite() reads
    # g_favorite_page_index for its page-size sanity check, so the index must
    # be rebound as a global here rather than shadowed by a local.
    global g_favorite_page_count, g_favorite_page_index
    g_favorite_page_index = 1
    while g_favorite_page_index <= g_favorite_page_count:
        url = '%s/%d' % (FAVORITE_ULR, g_favorite_page_index)
        print('%s%s' % (url, os.linesep))
        html = get_favorite(url)
        page_count = int(parse_favorite(html))
        # The "last page" link may be missing; keep the known count then.
        if page_count:
            g_favorite_page_count = page_count
        print('{index: %d, parse: %d, count: %d}' % (
            g_favorite_page_index, page_count, g_favorite_page_count))
        g_favorite_page_index += 1

    save_favorite()
    print('DONE!')
    
def get_favorite(url):
    """Download *url* and return the raw response body.

    The request carries the User-Agent from FAVORITE_HEAD__USER_AGENT.  The
    response object is closed before returning, even when reading fails.
    """
    import urllib2
    request = urllib2.Request(
        url,
        headers={'User-Agent': FAVORITE_HEAD__USER_AGENT}
        )
    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        # Release the connection explicitly instead of leaving it to the
        # garbage collector.
        response.close()

def parse_favorite(html):
    """Extract the favorite items from one listing page.

    Every item found inside the ``fav_list`` block is appended to
    ``g_favorite_items`` as ``[time, url, title, tags, description]``, where
    ``tags`` is a list of tag names (empty when the item has none) and
    ``description`` is '' when the item has no description.

    Returns whatever parse_footer(html) returns for the same page.
    """
    container = re.search(
        r'<div[^>]+class="fav_list">(?:.|\r|\n)+?</div>',
        html
        )
    # A page without the list container contributes no items instead of
    # failing with AttributeError on a None match.
    items_html = container.group() if container else ''
    lst_item = re.findall(
        r'<dl>[^<]+<dt>[^<]*<span class="time">(.+)</span>[^<]*<span class="title">[^<]*<a href="([^"]+)">([^<]+)</a>[^<]*</span>[^<]*</dt>[^<]*(?:<dd>[^<]*<strong>标签：</strong>[^<]*((?:<a[^>]+href="[^"]*"[^>]*>[^<]*</a>[^<]*)+)</dd>)?[^<]*(?:[^<]*<dd>[^<]*<strong>描述：</strong>((?:.|\r|\n)+?)</dd>[^<]*)?</dl>',
        items_html
        )
    # Sanity check: a page that is neither the first nor the last is expected
    # to hold 20 items.  Report a mismatch instead of dropping into the
    # debugger so unattended runs are not blocked.
    if 20 != len(lst_item) and 1 != g_favorite_page_index and g_favorite_page_count != g_favorite_page_index:
        print('WARNING: page %d yielded %d items instead of 20' % (
            g_favorite_page_index, len(lst_item)))
    for it_item in lst_item:
        record = list(it_item)
        # Use the same anchor shape the item pattern accepted, so tags whose
        # <a> carries extra attributes are not silently lost.
        record[3] = re.findall(
            r'<a[^>]+href="[^"]*"[^>]*>([^<]*)</a>',
            it_item[3]
            )
        g_favorite_items.append(record)

    # globe_item is the running total, current_item the count on this page.
    print('{globe_item: %d, current_item: %d}' % (len(g_favorite_items), len(lst_item)))
    return parse_footer(html)

def parse_footer(html):
    """Return the last page number advertised by the pager in *html*.

    Looks for the "尾页" (last page) link pointing into FAVORITE_USER's
    favorites.  When exactly one such link is found its page number is
    returned as the matched string; in every other case the integer 0 is
    returned.
    """
    # Escape the user name so any regex metacharacter in it matches literally.
    pattern = (
        r'<a class="pageliststy" href="/'
        + re.escape(FAVORITE_USER)
        + r'/favorite/(\d+)">尾页</a>'
        )
    matches = re.findall(pattern, html)
    return matches[0] if len(matches) == 1 else 0
    
def save_favorite():
    """Write the collected favorites to a timestamped XML file.

    The file is created in the current working directory and named
    ``[<user>]优快云FavoriteBackup_<YYYYmmdd_HHMMSS>.xml``.  Its content comes
    from favorite_tostring().  Errors raised while opening or writing the
    file are printed rather than propagated.
    """
    lt = time.localtime()
    build_file_date = time.strftime("%Y%m%d_%H%M%S", lt)
    build_date = time.strftime("%Y-%m-%d %H:%M:%S", lt)
    # os.path.join picks the platform's separator; a hard-coded backslash
    # only forms a directory path on Windows.
    favorite_file_name = os.path.join(
        os.getcwd(),
        '[%s]优快云FavoriteBackup_%s.xml' % (FAVORITE_USER, build_file_date)
        )
    print(favorite_file_name)
    try:
        # The with-block closes the file on every path and never touches an
        # unbound name when open() itself fails.
        with open(favorite_file_name, 'w+') as fo:
            fo.write(favorite_tostring(build_date))
    except Exception as e:
        # Best-effort save: report the problem and return normally.
        print(e)
    
def favorite_tostring(build_date):
    """Serialize ``g_favorite_items`` into an XML string.

    build_date is written verbatim into the root element's ``build_date``
    attribute.  Each record is expected to be indexable as
    ``[time, url, title, tags, description]``; url, title and a non-empty
    description are wrapped in CDATA sections, tags are emitted one
    ``<tag>`` element each.  Elements are prefixed with STR_TAB indentation
    but no line breaks are inserted between them.

    Returns the complete document as a single string.
    """
    def cdata(text):
        # A literal ']]>' would terminate the section early; split it across
        # two adjacent CDATA sections so the document stays well-formed.
        safe = text.replace(']]>', ']]]]><![CDATA[>')
        return '<![CDATA[%s%s%s]]>' % (os.linesep, safe, os.linesep)

    # Collect fragments and join once instead of repeated string +=.
    parts = ['<favorites count="%d" build_date="%s">' % (
        len(g_favorite_items),
        build_date
        )]
    for it in g_favorite_items:
        parts.append(STR_TAB + '<item>')
        parts.append(STR_TAB * 2 + '<time>%s</time>' % it[0])
        parts.append(STR_TAB * 2 + '<url>%s</url>' % cdata(it[1]))
        parts.append(STR_TAB * 2 + '<title>%s</title>' % cdata(it[2]))
        parts.append(STR_TAB * 2 + '<tags count="%d">' % len(it[3]))
        for it_tag in it[3]:
            parts.append(STR_TAB * 3 + '<tag>%s</tag>' % it_tag)
        parts.append(STR_TAB * 2 + '</tags>')
        # The description element is omitted entirely when it is empty.
        if it[4]:
            parts.append(STR_TAB * 2 + '<description>%s</description>' % cdata(it[4]))
        parts.append(STR_TAB + '</item>')
    parts.append('</favorites>')
    return ''.join(parts)
    
# Run the backup only when executed as a script, not when imported.
if __name__ == '__main__':
    loop_favorite()