(1) Get a file's path
from unipath import Path

ofn = 'test.txt'
cwd = Path(__file__).ancestor(1)  # directory containing this script
ofn_path = Path(cwd, ofn)         # join directory and filename
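A minimal usage sketch, assuming test.txt sits next to the script (exists() is unipath's own path helper):

if ofn_path.exists():
    with open(ofn_path) as f:
        print f.read()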
(2) Print the error stack trace
import traceback
logger.error('error [%s]' % traceback.format_exc())
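traceback.format_exc() only returns a useful trace while an exception is being handled, so it belongs inside an except block. A minimal sketch (the logger setup and risky_call() are assumptions for illustration):

import logging
import traceback

logger = logging.getLogger(__name__)

try:
    risky_call()  # hypothetical function that may raise
except Exception:
    logger.error('error [%s]' % traceback.format_exc())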
(3) Time a method with timeit
from timeit import Timer

t1 = Timer("test()", "from __main__ import test")
print t1.timeit(10000)           # total seconds for 10000 calls
print min(t1.repeat(3, 10000))   # best of 3 runs of 10000 calls each
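For the setup string "from __main__ import test" to work, test must be defined at module level. A runnable sketch with a trivial stand-in:

from timeit import Timer

def test():
    return sum(range(100))  # trivial stand-in for the code under test

t1 = Timer("test()", "from __main__ import test")
print min(t1.repeat(3, 10000))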
(4) Inspect garbage collection and memory usage
import gc
import objgraph

gc.collect()                               # force a full garbage-collection pass
objgraph.show_most_common_types(limit=50)  # 50 most common live object types
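To spot a leak between two points in the program, objgraph's show_growth() prints only the type counts that increased since its previous call. A sketch (the list allocation is just a simulated leak):

import objgraph

objgraph.show_growth()                   # take a baseline snapshot
leaky = [object() for _ in range(1000)]  # simulated leak
objgraph.show_growth()                   # prints types whose counts grew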
(5) Measure elapsed time with datetime
import datetime

before = datetime.datetime.now()
# ... code being timed ...
end = datetime.datetime.now()
logger.error('init [%s]' % (end - before))
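The same pattern wrapped in a reusable context manager; a sketch, with the timed name and label chosen for illustration:

import contextlib
import datetime
import logging

logger = logging.getLogger(__name__)

@contextlib.contextmanager
def timed(label):
    before = datetime.datetime.now()
    yield
    logger.error('%s [%s]' % (label, datetime.datetime.now() - before))

with timed('init'):
    pass  # code being timed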
(6) Fetch a web page
import traceback
import urllib

def fetch(url):
    """Fetch url and return its body as a UTF-8 unicode string, or u'' on failure."""
    content = u''
    try:
        resp = urllib.urlopen(url)
        if resp.getcode() == 200:
            content = unicode(resp.read().strip(), 'utf-8', 'ignore')
        else:
            logger.error('fetch error [%s]' % url)
    except Exception:
        logger.error('fetch error %s' % traceback.format_exc())
    return content
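Usage (the URL is a placeholder):

html = fetch('http://example.com/')
if html:
    print html[:200]  # first 200 characters of the page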
(7) Parse the page with regular expressions
import re

def parse_html(url, content):
    """Extract (detail_url, detail_title) pairs from the movie_list block."""
    title_pattern = r'<ul class="movie_list">(?P<detail_content>.*?)</ul>'
    detail_pattern = r'<li>.*?href="(?P<detail_url>.*?)".*?title="(?P<detail_title>.*?)">.*?</li>'
    list_res = []
    res = re.search(title_pattern, content, re.S)
    if res:
        detail_content = res.group('detail_content')
        # findall with two named groups yields a list of (url, title) tuples
        list_res = re.findall(detail_pattern, detail_content, re.S)
    return list_res
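Chaining the two helpers together; a sketch, where the URL and the movie_list markup are assumptions implied by the patterns above:

url = 'http://example.com/movies'
for detail_url, detail_title in parse_html(url, fetch(url)):
    print detail_url, detail_title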