CSDN收藏备份Python脚本

本文介绍了一个简单的Python2脚本,用于备份CSDN空间中的收藏夹,并以此来验证Python2的学习成果。该脚本利用了urllib2和正则表达式(re)进行网页抓取和解析。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

备份一下CSDN空间的收藏模块,顺便检测一下 python2 的学习成果,简单的使用 urllib2 和 re,轻拍。


#!/usr/bin/env python2
# -*- coding: utf-8 -*-
import re
import os
import time
import pdb

# Account whose favorites are backed up.
FAVORITE_USER = 'summer__xt'
# Base URL of the favorites listing; loop_favorite appends '/<page number>'.
# NOTE(review): the host looks like a mirror-rewritten form of the original
# site's domain -- confirm it is the intended endpoint before running.
FAVORITE_ULR = 'http://my.youkuaiyun.com/%s/favorite' % FAVORITE_USER
# User-Agent header that get_favorite sends with each request.
FAVORITE_HEAD__USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0;)'
# One indentation unit (four spaces) used by favorite_tostring.
STR_TAB = ' ' * 4
# Number of listing pages; loop_favorite replaces it with the parsed value.
g_favorite_page_count = 1
# Page index read by the sanity check in parse_favorite.
g_favorite_page_index = 1
# Collected favorites; parse_favorite appends one list per entry.
g_favorite_items = []

def loop_favorite():
    """Fetch every favorites page, collect its entries, then save the backup.

    Requests FAVORITE_ULR + '/<index>' starting at index 1 and continues
    while the index does not exceed g_favorite_page_count.  The page count is
    replaced whenever a page reports a non-zero last-page number.  Once all
    pages are processed the collected items are written by save_favorite().

    Side effects: updates the module globals g_favorite_page_count and
    g_favorite_page_index, prints progress, and (through parse_favorite)
    appends to g_favorite_items.
    """
    # Both counters must be the module globals: parse_favorite reads
    # g_favorite_page_index for its sanity check (which calls
    # pdb.set_trace()), so assigning a same-named local here would leave that
    # check permanently looking at the initial value.
    global g_favorite_page_count, g_favorite_page_index
    g_favorite_page_index = 1
    while g_favorite_page_index <= g_favorite_page_count:
        url = '%s/%d' % (FAVORITE_ULR, g_favorite_page_index)
        print('%s%s' % (url, os.linesep))
        html = get_favorite(url)
        page_count = int(parse_favorite(html))
        # The "last page" link may be absent; keep the previous count then.
        if page_count:
            g_favorite_page_count = page_count
        print('{index: %d, parse: %d, count: %d}' % (
            g_favorite_page_index, page_count, g_favorite_page_count))
        g_favorite_page_index += 1

    save_favorite()
    print('DONE!')
    
def get_favorite(url):
    """Download *url* and return the raw response body.

    The request is sent with the FAVORITE_HEAD__USER_AGENT User-Agent header.
    Exceptions raised by urllib2 are not caught here and propagate to the
    caller.
    """
    import urllib2
    request = urllib2.Request(
        url, headers={'User-Agent': FAVORITE_HEAD__USER_AGENT})
    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        # Close the response even when read() raises, so it is not leaked.
        response.close()

def parse_favorite(html):
    """Extract the favorites listed in *html* and append them to g_favorite_items.

    Every entry is stored as a list [time, url, title, tags, description],
    where tags is a list of tag names and description is '' when the entry
    has no description block.

    Returns:
        The value of parse_footer(html): the last-page number, or 0.
    """
    container = re.search(
        r'<div[^>]+class="fav_list">(?:.|\r|\n)+?</div>',
        html
        )
    # A page without the list container yields no entries instead of failing
    # with AttributeError on a None match.
    items_html = container.group() if container else ''
    lst_item = re.findall(
        r'<dl>[^<]+<dt>[^<]*<span class="time">(.+)</span>[^<]*<span class="title">[^<]*<a href="([^"]+)">([^<]+)</a>[^<]*</span>[^<]*</dt>[^<]*(?:<dd>[^<]*<strong>标签:</strong>[^<]*((?:<a[^>]+href="[^"]*"[^>]*>[^<]*</a>[^<]*)+)</dd>)?[^<]*(?:[^<]*<dd>[^<]*<strong>描述:</strong>((?:.|\r|\n)+?)</dd>[^<]*)?</dl>',
        items_html
        )
    # Debug aid: break into pdb when a page that is neither the first nor the
    # last one (according to the module-level page index and page count) does
    # not hold exactly 20 entries.
    if 20 != len(lst_item) and 1 != g_favorite_page_index and g_favorite_page_count != g_favorite_page_index:
        pdb.set_trace()
    for it_item in lst_item:
        entry = list(it_item)
        # Use the same anchor shape the item pattern accepts (attributes
        # before and after href); a stricter pattern would silently drop tags.
        entry[3] = re.findall(
            r'<a[^>]+href="[^"]*"[^>]*>([^<]*)</a>',
            it_item[3]
            )
        g_favorite_items.append(entry)

    # Arguments follow the labels: overall total first, this page second.
    print('{globe_item: %d, current_item: %d}' % (len(g_favorite_items), len(lst_item)))
    return parse_footer(html)

def parse_footer(html):
    """Return the last-page number advertised by the pager in *html*.

    Searches for the "last page" pager link that points into FAVORITE_USER's
    favorites.

    Returns:
        The page number as the matched digit string when exactly one such
        link is present, otherwise the integer 0.
    """
    # Escape the user name so any regex metacharacter in it matches literally.
    pattern = (
        r'<a class="pageliststy" href="/' + re.escape(FAVORITE_USER)
        + r'/favorite/(\d+)">尾页</a>'
    )
    found = re.findall(pattern, html)
    return found[0] if 1 == len(found) else 0
    
def save_favorite():
    """Write the collected favorites to a timestamped XML file.

    The file is created in the current working directory; its name is built
    from FAVORITE_USER and the current local time.  The content comes from
    favorite_tostring().  Any exception raised while opening, building or
    writing is printed rather than propagated.
    """
    lt = time.localtime()
    build_file_date = time.strftime("%Y%m%d_%H%M%S", lt)
    build_date = time.strftime("%Y-%m-%d %H:%M:%S", lt)
    # os.path.join uses the platform's separator instead of a hard-coded
    # backslash, so the file lands inside the working directory everywhere.
    favorite_file_name = os.path.join(
        os.getcwd(),
        '[%s]优快云FavoriteBackup_%s.xml' % (FAVORITE_USER, build_file_date)
        )
    print(favorite_file_name)
    try:
        # The with-block closes the file on every path; a finally clause
        # calling fo.close() would hit an unbound name when open() fails.
        with open(favorite_file_name, 'w+') as fo:
            fo.write(favorite_tostring(build_date))
    except Exception as e:
        print(e)
    
def favorite_tostring(build_date):
    """Render g_favorite_items as the backup XML text.

    Args:
        build_date: timestamp string stored in the root element's
            build_date attribute.

    Returns:
        One string: a <favorites> root carrying the item count, followed by
        an <item> per entry with <time>, <url>, <title>, <tags> and, when
        the entry's description is non-empty, <description>.  Elements are
        prefixed with STR_TAB-based indentation; line separators appear only
        inside the CDATA sections.
    """
    eol = os.linesep
    level1 = STR_TAB
    level2 = STR_TAB * 2
    level3 = STR_TAB * 3

    # Gather the fragments in order and join them once at the end.
    chunks = [
        '<favorites count="%d" build_date="%s">' % (
            len(g_favorite_items), build_date)
    ]
    for entry in g_favorite_items:
        tag_names = entry[3]
        chunks.append(level1 + '<item>')
        chunks.append(level2 + '<time>%s</time>' % entry[0])
        chunks.append(level2 + '<url><![CDATA[%s%s%s]]></url>' % (eol, entry[1], eol))
        chunks.append(level2 + '<title><![CDATA[%s%s%s]]></title>' % (eol, entry[2], eol))
        chunks.append(level2 + '<tags count="%d">' % len(tag_names))
        chunks.extend([level3 + '<tag>%s</tag>' % name for name in tag_names])
        chunks.append(level2 + '</tags>')
        if len(entry[4]) > 0:
            chunks.append(level2 + '<description><![CDATA[%s%s%s]]></description>' % (eol, entry[4], eol))
        chunks.append(level1 + '</item>')
    chunks.append("</favorites>")
    return ''.join(chunks)
    
# Start the backup only when this file is executed as a script.
if __name__ == '__main__':
    loop_favorite()
   


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值