# -*- coding: utf-8 -*-
import urllib2
import re
import cookielib
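# NOTE: this is Python 2 code; urllib2 and cookielib were merged into
# urllib.request and http.cookiejar in Python 3 (see the port sketch after the listing).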
class CSDN:  # rendered as "优快云" by 优快云's editor; Python 2 identifiers must be ASCII
    def __init__(self):
        self.url = 'http://blog.youkuaiyun.com/aricover/article/details/78684894/'
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self.headers = {'User-Agent': self.user_agent}
        self.filename = 'csdncookie.txt'  # cookies are persisted here in Mozilla/Netscape format
        # Tag-stripping and entity-unescaping rules, applied as regexes in replaceSB().
        # The entity keys below (&lt;, &gt;, &#43;, &quot;) are reconstructed here:
        # 优快云's editor rendered them as literal characters in the published listing,
        # which made '+' an invalid regex and turned the last four rules into no-ops.
        self.patterns = {
            '<p>': '',
            '</p>': '',
            '<pre name="code".*?>': '',
            '</pre>': '\n',
            '<br>': '',
            '</br>': '',
            '<div.*?>': '',
            '</div>': '',
            r'&lt;': '<',
            r'&gt;': '>',
            r'&#43;': '+',
            r'&quot;': '\"'
        }
    def getPage(self):
        try:
            req = urllib2.Request(self.url, headers=self.headers)
            # Capture the response cookies and persist them to csdncookie.txt.
            cookie = cookielib.MozillaCookieJar(self.filename)
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
            response = opener.open(req)
            cookie.save(ignore_discard=True, ignore_expires=True)
            content = response.read()
            return content
        except urllib2.URLError, e:
            if hasattr(e, 'reason'):
                print 'Request failed, reason: %s' % e.reason
    def replaceSB(self, content):
        # Apply every cleanup rule in turn; re.S makes '.' match newlines too.
        for k, v in self.patterns.items():
            pattern = re.compile(k, re.S)
            content = re.sub(pattern, v, content)
        return content
    def write2File(self, content):
        # 'w' truncates on every call; fine here since the page has a single article body.
        f = open('blog.txt', 'w')
        f.write(content)
        f.close()
    def getContent(self):
        content = self.getPage()
        # Non-greedy match stops at the FIRST '</div>', so nested divs inside the
        # article body will truncate the capture; acceptable for this simple demo.
        pattern = re.compile(r'<div id="article_content".*?</div>', re.S)
        items = re.findall(pattern, content)
        for item in items:
            zz = self.replaceSB(item)
            self.write2File(zz)
            print zz
csdn = CSDN()
csdn.getContent()
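For readers on Python 3, here is a minimal sketch of the same fetch-and-save-cookies step; urllib2 and cookielib became urllib.request and http.cookiejar. The URL, headers, and cookie filename are carried over from the listing above, and error handling is reduced to the same reason check.

# Minimal Python 3 sketch of getPage(); same cookie-jar behaviour as the listing above.
import urllib.request
import urllib.error
import http.cookiejar

url = 'http://blog.youkuaiyun.com/aricover/article/details/78684894/'
headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}

def get_page():
    try:
        req = urllib.request.Request(url, headers=headers)
        cookie = http.cookiejar.MozillaCookieJar('csdncookie.txt')
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
        response = opener.open(req)
        cookie.save(ignore_discard=True, ignore_expires=True)
        return response.read().decode('utf-8', errors='replace')
    except urllib.error.URLError as e:
        if hasattr(e, 'reason'):
            print('Request failed, reason: %s' % e.reason)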
This was just practice to get familiar with Python and web scraping, using one of my own blog posts as the example. It only scrapes the article at a specific link, not a large-scale crawl.
Some characters in the listing were replaced by 优快云's editor (hence the reconstructed HTML entities above); source code download: http://download.youkuaiyun.com/download/aricover/10148984#
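Rather than maintaining the entity table by hand, the standard library can undo HTML entity escaping; a short sketch in the listing's Python 2 dialect (unescape is undocumented but long-standing in Python 2; Python 3 would use html.unescape instead):

# Sketch: let the stdlib undo HTML entity escaping instead of hand-written regex pairs.
from HTMLParser import HTMLParser  # Python 3: from html import unescape

print HTMLParser().unescape('&lt;pre&gt; a &#43; b &quot;quoted&quot;')
# -> <pre> a + b "quoted"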