I recently started picking up Python, and studying syntax alone was putting me to sleep. Since it's a scripting language, it seemed better to build some actual tools, and the most famous thing about Python is probably web crawlers. I looked up some examples online and eventually put together a small crawler of my own.
- Environment: Python 2.7
- Goal: fetch the posts in a given thread and assemble them into a web page
A simple crawler needs no extra downloads; importing a few built-in modules is enough:
# -*- coding: utf-8 -*-
import urllib2  # HTTP requests
urllib_request = urllib2  # Python 3 names this module differently, so I alias it to make the name easier to remember
import re  # regular expressions
import os
import sys
reload(sys)
sys.setdefaultencoding('utf8')  # works around encoding errors that came up later when writing the output file
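As an aside: on Python 3 the same setup would look roughly like the sketch below (urllib2 became urllib.request there, and the reload/setdefaultencoding hack is neither available nor needed); this post sticks with 2.7 throughout.
# Python 3 equivalent of the imports above (a sketch only; everything
# else in this post assumes Python 2.7)
import urllib.request
urllib_request = urllib.request  # same alias trick, so later code reads the same
import re
import os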
Fetching the page content itself is not the hard part; I've done it in several languages before, and Python makes it even simpler:
response = urllib_request.urlopen(self.url, timeout = 60)
self.content = response.read().decode('utf-8', 'ignore')
That's the thread content retrieved (the self is there because this lives inside a class).
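The fetch works just as well outside a class. A minimal standalone sketch (the User-Agent header is my own optional addition here, since some sites reject the default one urllib2 sends; the rest of this post does without it):
url = 'http://tieba.baidu.com/p/1696552346'
request = urllib_request.Request(url, headers = {'User-Agent': 'Mozilla/5.0'})
response = urllib_request.urlopen(request, timeout = 60)
content = response.read().decode('utf-8', 'ignore')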
The real work comes next: filtering out what we need, namely the poster names and the post bodies. That turns out to be simple too. With a little HTML knowledge, you can find the patterns using Chrome's developer tools or similar.
In this example, the poster's div id is p_author, so the task is clear: pull the poster names out of the big blob of markup fetched above.
pattern = re.compile(r'p_author.*?</a>')  # compile the regex
authorer = re.findall(pattern, self.content)  # collect every match
authorerContent = ''.join(authorer)  # join the array into one string
authorer1 = re.findall(r'username=".*?"', authorerContent)  # narrow down step by step from here
authorer1Content = ''.join(authorer1)
authorer2 = re.findall(r'".*?"', authorer1Content)
authorer2Content = ''.join(authorer2)
authorer2ContentFix = authorer2Content.replace('""', '|')
authorer2ContentFix1 = authorer2ContentFix.replace('"', '')
authorList = authorer2ContentFix1.split('|')
That chain leaves us with an array containing nothing but poster names, which is one half of the data we need.
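By the way, a capture group can do the whole chain in one pass: when a pattern contains parentheses, re.findall returns only the captured part. A sketch equivalent to the filtering above:
# one-pass version of the author filtering (a sketch):
# findall returns just the text inside the parentheses
authorList = re.findall(r'p_author.*?username="(.*?)"', self.content)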
Next come the post bodies; the principle is much the same:
article = re.findall(r'j_d_post_content.*?</div>', self.content)  # re accepts a raw pattern directly, no separate compile needed
articleContent = ''.join(article)
articleContent = articleContent.replace('j_d_post_content ">', '')
articleList = articleContent.split('</div>')
articleList.pop()  # the last item of the split is empty, drop it
self.articleContent = ''.join(articleList)  # kept on self so we can filter more things, e.g. images, out of the post bodies later
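The same capture-group trick collapses the replace-then-split steps here as well — a sketch, assuming every post body opens with the exact j_d_post_content "> fragment seen above:
# one-pass version (a sketch): capture what sits between the opening
# fragment and the closing </div>
articleList = re.findall(r'j_d_post_content ">(.*?)</div>', self.content)
self.articleContent = ''.join(articleList)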
That's the post bodies done. After some basic work matching the two arrays up, the goal seems reached. Why "seems"? Because Baidu doesn't allow hotlinking its images: put the crawled page straight onto your own server and the pictures turn into public-service ads. So the next step is to pull the image URLs out of the post bodies, download the files, and swap the original URLs for ones pointing at your own server.
It sounds tedious, but it's still straightforward.
First, the plan:
- Extract the image paths with a regex
- Download the images locally
- Replace the original links
Extracting paths with a regex should feel familiar by now:
img = re.findall(r'src=".*?"', self.articleContent)
imgContent = ''.join(img)
imgContent1 = imgContent.replace('src="', '')
imgContent2 = imgContent1.replace('"', '|')
imgList = imgContent2.split('|')
imgList.pop()  # the last item is empty
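One caveat: src=".*?" will also pick up the src attribute of any <script> or <embed> tags that survived into articleContent. A pattern scoped to <img> tags, plus a capture group, avoids that and the replace/split dance in one go — a sketch:
# only src attributes that sit inside an <img> tag (a sketch)
imgList = re.findall(r'<img[^>]*?src="(.*?)"', self.articleContent)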
The downloads are written to disk with the open() function; look up a tutorial if it's new to you. Note that we also collect each image's file name, which is data we need:
imgNameList = []
for i in range(len(imgList)) :
    imgUris = imgList[i].split('/')  # split the path to get the file name
    imgName = imgUris.pop()
    imgNameList.insert(0, imgName)
imgNameList.reverse()  # insert() always prepends, so reverse once to line the names up with the URL array
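# Aside (a sketch, not the original code): insert(0, ...) followed by a
# final reverse() is just an append in disguise, so a list comprehension
# builds the same name list in one line:
#   imgNameList = [u.split('/')[-1] for u in imgList]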
for i in range(len(imgNameList)) :  # download each image locally
    try :
        filename = r'catchimg/' + imgNameList[i].strip()
        if os.path.exists(filename) :  # skip anything already downloaded
            continue
        conn = urllib_request.urlopen(imgList[i])
        with open(filename, 'wb') as f :  # the with block closes the file for us
            f.write(conn.read())
    except Exception :
        print('img Error')
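One thing the loop above assumes is that the catchimg folder already exists; open() can't create a file inside a directory that isn't there. I made it by hand, but two lines before the loop would let the script take care of it (a sketch):
# make sure the target folder exists before downloading (a sketch)
if not os.path.isdir('catchimg') :
    os.makedirs('catchimg')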
That covers two of the three jobs: the image paths are extracted and the files are downloaded into the catchimg folder. The third I solved with JavaScript, which has the convenient document object to work with; the script is spliced into the output HTML file:
postingContentAndJs = '<div id= "123000">' + postingContent + """
</div>
<script type = "text/javascript" >
var aHrefs = document.getElementById('123000').getElementsByTagName('a');
for (var i = 0; i < aHrefs.length; i++) {
    aHrefs[i].onclick = function () { return false; };
    aHrefs[i].href = '#';
    aHrefs[i].style.color = '#000000';
    aHrefs[i].style.cursor = 'default';
}
var imgs = document.getElementById('123000').getElementsByTagName('img');
for (var i = 0; i < imgs.length; i++) {
    var src = imgs[i].src;
    var srcUris = src.split('/');
    var localSrc = 'catchimg/' + srcUris[srcUris.length - 1];
    imgs[i].src = localSrc;
}
</script>
<br>
<div style = "width:100%;text-align:center">
<a href = "
""" + url + """
" target = "_blank"><input type = "button" value = "贴吧见!"></a>
|
<a href = "baidu
""" + str(1 if (int(currentPage) - 1) < 1 else (int(currentPage) - 1)) + """
.htm"><input type = "button" value = "上一页?"></a>
<a href = "baidu
""" + str(int(PAGE) if (int(currentPage) + 1) > int(PAGE) else (int(currentPage) + 1)) + """
.htm"><input type = "button" value = "下一页?"></a>
</div>
<br>
<br>
"""
A bit long, isn't it? Honestly it doesn't matter whether you read it all; it just concatenates one big string. The only Python point worth noting, beyond the JavaScript itself, is that inside triple quotes, quote characters and line breaks need no escaping.
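If the concatenation ever gets unwieldy, str.format on a triple-quoted string is a drop-in alternative: braces mark the slots instead of the """ + var + """ seams. A sketch of the same idea, shown on the link footer only (the JavaScript part would need its literal braces doubled as {{ and }}):
footer = """
<a href = "{url}" target = "_blank"><input type = "button" value = "贴吧见!"></a>
|
<a href = "baidu{prev}.htm"><input type = "button" value = "上一页?"></a>
<a href = "baidu{next}.htm"><input type = "button" value = "下一页?"></a>
""".format(url = url,
           prev = max(1, int(currentPage) - 1),
           next = min(int(PAGE), int(currentPage) + 1))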
With that it seems finished, but the snippet above already gave away the simple pagination.
A thread holds too much content for one file, so my strategy is to generate one HTML file per page, with file names I fix myself. The part that needs attention is working out how many pages the thread actually has. How? By analyzing the page content once again:
page = re.findall(r'回复贴.*?</span>'.decode('utf-8', 'ignore'), self.content)  # decode so the pattern is unicode, like self.content
pageContent = ''.join(page)
pageNum = re.search(r'>\d*?<', pageContent).group().replace('<', '').replace('>', '')  # the digits between > and < are the page count
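The capture-group trick from earlier works here too, and a ur'' literal spares the .decode() on the pattern — a sketch that assumes, like the original, that the first number after 回复贴 is the page count:
# one-pass version (a sketch): a unicode pattern needs no .decode()
pageNum = int(re.search(ur'回复贴.*?>(\d+)<', self.content).group(1))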
After that, just apply some object-oriented thinking and call these functions over and over. Here's a screenshot of the result.
The styling would take real JavaScript skill to polish further; mine came out middling.
At the moment I show this through an iframe on a live site, where you can pick whatever threads you like; it works well enough.
The full code:
# -*- coding: utf-8 -*-
import urllib2
urllib_request = urllib2
import re
import os
import sys
reload(sys)
sys.setdefaultencoding('utf8')

class CatchUrl :
    # attributes set along the way: self.url, self.content, self.articleContent
    def __init__(self, url) :
        self.url = url
        response = urllib_request.urlopen(self.url, timeout = 60)
        self.content = response.read().decode('utf-8', 'ignore')

    def getAuthorList(self) :
        pattern = re.compile(r'p_author.*?</a>')
        authorer = re.findall(pattern, self.content)
        authorerContent = ''.join(authorer)
        authorer1 = re.findall(r'username=".*?"', authorerContent)
        authorer1Content = ''.join(authorer1)
        authorer2 = re.findall(r'".*?"', authorer1Content)
        authorer2Content = ''.join(authorer2)
        authorer2ContentFix = authorer2Content.replace('""', '|')
        authorer2ContentFix1 = authorer2ContentFix.replace('"', '')
        authorList = authorer2ContentFix1.split('|')
        print(len(authorList))
        return authorList

    def getArticleList(self) :
        article = re.findall(r'j_d_post_content.*?</div>', self.content)
        articleContent = ''.join(article)
        articleContent = articleContent.replace('j_d_post_content ">', '')
        articleList = articleContent.split('</div>')
        articleList.pop()  # the last item is empty
        self.articleContent = ''.join(articleList)  # kept for image extraction below
        print(len(articleList))
        return articleList

    def getImgList(self) :
        img = re.findall(r'src=".*?"', self.articleContent)
        imgContent = ''.join(img)
        imgContent1 = imgContent.replace('src="', '')
        imgContent2 = imgContent1.replace('"', '|')
        imgList = imgContent2.split('|')
        imgList.pop()  # the last item is empty
        imgNameList = []
        for i in range(len(imgList)) :
            imgUris = imgList[i].split('/')
            imgName = imgUris.pop()
            imgNameList.insert(0, imgName)
        imgNameList.reverse()  # undo the reversal caused by insert(0, ...)
        for i in range(len(imgNameList)) :
            try :
                filename = r'catchimg/' + imgNameList[i].strip()
                if os.path.exists(filename) :  # already downloaded
                    continue
                conn = urllib_request.urlopen(imgList[i])
                with open(filename, 'wb') as f :
                    f.write(conn.read())
            except Exception :
                print('img Error')

    def getPage(self) :
        page = re.findall(r'回复贴.*?</span>'.decode('utf-8', 'ignore'), self.content)
        pageContent = ''.join(page)
        pageNum = re.search(r'>\d*?<', pageContent).group().replace('<', '').replace('>', '')
        return int(pageNum)

def catchPage(url, currentPage) :
    global URL,PAGE
    catchUrl = CatchUrl(url)
    authorList = catchUrl.getAuthorList()
    articleList = catchUrl.getArticleList()
    catchUrl.getImgList()
    postingList = []
    while len(authorList) != 0 and len(authorList) == len(articleList) :
        author = authorList.pop()
        article = articleList.pop()
        posting = '<div class = "myPosting" style = "padding:15px;font-size:14px;">' \
            + '<div class = "myAuthor" style = "float:left;width:100px;margin:10px;clear:both;text-align:left;border-left:3px solid #ADD8E6;padding-left:5px">' \
            + author + ':' + '</div>' \
            + '<div class = "myArticle" style = "width:80%;margin:10px;margin-left:110px;text-align:left;border:1px #9D9D9D solid;border-radius:3px;padding:10px;background:#F0F0F0">' \
            + article.lstrip() + '</div></div>'
        postingList.insert(0, posting)
    postingContent = ''.join(postingList)
    postingContentAndJs = '<div id= "123000">' + postingContent + """
</div>
<script type = "text/javascript" >
var aHrefs = document.getElementById('123000').getElementsByTagName('a');
for (var i = 0; i < aHrefs.length; i++) {
    aHrefs[i].onclick = function () { return false; };
    aHrefs[i].href = '#';
    aHrefs[i].style.color = '#000000';
    aHrefs[i].style.cursor = 'default';
}
var imgs = document.getElementById('123000').getElementsByTagName('img');
for (var i = 0; i < imgs.length; i++) {
    var src = imgs[i].src;
    var srcUris = src.split('/');
    var localSrc = 'catchimg/' + srcUris[srcUris.length - 1];
    imgs[i].src = localSrc;
}
</script>
<br>
<div style = "width:100%;text-align:center">
<a href = "
""" + url + """
" target = "_blank"><input type = "button" value = "贴吧见!"></a>
|
<a href = "baidu
""" + str(1 if (int(currentPage) - 1) < 1 else (int(currentPage) - 1)) + """
.htm"><input type = "button" value = "上一页?"></a>
<a href = "baidu
""" + str(int(PAGE) if (int(currentPage) + 1) > int(PAGE) else (int(currentPage) + 1)) + """
.htm"><input type = "button" value = "下一页?"></a>
</div>
<br>
<br>
"""
    with open('baidu' + currentPage + '.htm', 'wb+') as f :
        f.write(postingContentAndJs.encode('utf-8', 'ignore'))

def main() :
    global URL,PAGE
    URL = 'http://tieba.baidu.com/p/1696552346'
    catchUrl = CatchUrl(URL)
    PAGE = catchUrl.getPage()
    for i in range(PAGE) :
        catchPage(URL + '?pn=' + str(i + 1), currentPage = str(i + 1))

main()
Copy it, save it as a .py file, and it should basically run; to crawl a different thread, just change the URL near the end.
Also, if you want cron on Linux to run it automatically, remember to change every path into an absolute path.
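For reference, a crontab entry might look like the line below (a sketch; the paths are placeholders for wherever you keep the script, and cd-ing into the folder first is an alternative to hard-coding absolute paths inside it):
# run every hour, on the hour (a sketch; adjust the paths to your setup)
0 * * * * cd /absolute/path/to/crawler && /usr/bin/python crawler.py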