# -*- coding:utf-8 -*-
# Reference: http://cuiqingcai.com/993.html
# Tieba thread URL: https://tieba.baidu.com/p/3138733512?see_lz=1&pn=1
import urllib
import urllib2
import re
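# Baidu Tieba thread: "纯原创我心中的NBA2014-2015赛季现役50大"
# (an original ranking of the top 50 active NBA players, 2014-2015 season)
# Features:
# 1. Scrape any Baidu Tieba thread
# 2. Optionally restrict scraping to the original poster's posts
# 3. Parse the scraped content and save it to a file
# Example URL: https://tieba.baidu.com/p/3138733512?see_lz=1&pn=1
# Breakdown of the URL:
# https:// means the resource is transferred over HTTPS
# tieba.baidu.com is a second-level domain of Baidu that points to the Tieba servers
# /p/3138733512 is the path that locates this thread on the server
# see_lz and pn are the two query parameters: see_lz=1 shows only the original
# poster's posts, and pn is the page number of the thread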
class BDTB(object):
    """Scraper for a single Baidu Tieba thread.

    Page URLs are built as ``<baseUrl>?see_lz=<seeLz>&pn=<pageNum>`` and
    fetched with ``urllib2``; the title, the page count and the post bodies
    are then pulled out of the HTML with regular expressions.
    """

    # Compiled once at class level; re.S lets '.' span line breaks in the HTML.
    _TITLE_PATTERN = re.compile('<h3.*?class="core_title_txt pull-left text-overflow.*?>(.*?)</h3>', re.S)
    _PAGE_NUM_PATTERN = re.compile('<span class="red">(.*?)</span>', re.S)
    _CONTENT_PATTERN = re.compile('<div id="post_content.*?>(.*?)</div>', re.S)

    def __init__(self, baseUrl, seeLz):
        """Store the thread URL and request settings.

        baseUrl -- thread address without a query string,
                   e.g. 'https://tieba.baidu.com/p/3138733512'.
        seeLz   -- value sent as the ``see_lz`` query parameter
                   (1 = only the original poster's posts).
        """
        self.baseurl = baseUrl
        self.seelz = '?see_lz=' + str(seeLz)
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        # Headers sent with every request.
        self.headers = {'User-Agent': self.user_agent}
        # Helper that turns post HTML into plain text.
        self.tool = Tool()

    def getPage(self, pageNum):
        """Download page ``pageNum`` of the thread.

        Returns the page source decoded from UTF-8 (a unicode string), or
        None when the request fails with ``urllib2.URLError``; in that case
        the failure reason is printed.
        """
        url = self.baseurl + self.seelz + '&pn=' + str(pageNum)
        request = urllib2.Request(url, headers=self.headers)
        try:
            response = urllib2.urlopen(request)
            try:
                return response.read().decode('utf-8')
            finally:
                # Release the connection even if read()/decode() raises.
                response.close()
        except urllib2.URLError as e:
            # Not every URLError carries ``reason``; fall back to the
            # exception itself so a failure is never silent.
            reason = getattr(e, "reason", e)
            print(u"百度贴吧链接失败,错误原因是: %s" % (reason,))
            return None

    def _searchFirstPage(self, pattern):
        """Print and return group 1 of the first ``pattern`` match on page 1.

        Returns the stripped match text, or None when the page cannot be
        loaded or the pattern does not match (a notice is printed in both
        cases).
        """
        page = self.getPage(1)
        if not page:
            print("页面加载失败...")
            return None
        result = re.search(pattern, page)
        if not result:
            print("None")
            return None
        value = result.group(1).strip()
        print(value)
        return value

    def getTitle(self):
        """Print and return the thread title taken from page 1 (None if absent)."""
        return self._searchFirstPage(self._TITLE_PATTERN)

    def getPageNum(self):
        """Print and return the text of the first ``<span class="red">`` on page 1.

        NOTE(review): this span is presumably the thread's total page count;
        confirm against the live markup. The value is returned as text, not
        as an int, and is None when nothing matches.
        """
        return self._searchFirstPage(self._PAGE_NUM_PATTERN)

    def getContent(self, pageNum=1, fileName='bdtb.txt'):
        """Write the cleaned posts of one thread page to a text file.

        pageNum  -- page to fetch (defaults to 1, the previous fixed value).
        fileName -- output path (defaults to 'bdtb.txt', the previous fixed
                    value); the file is overwritten.

        Every post is preceded by a "<n>楼---" separator line, with n
        counting from 1 within the page. Returns the number of posts
        written, or None when the page could not be loaded.
        """
        page = self.getPage(pageNum)
        if not page:
            print("页面加载失败...")
            return None
        results = re.findall(self._CONTENT_PATTERN, page)
        separator = u"楼--------------------------------------------------------------------------------\n"
        with open(fileName, 'w') as file_object:
            for floor, result in enumerate(results, 1):
                text = str(floor) + separator + self.tool.replace(result) + '\n'
                # getPage() decoded the page to unicode, so the text has to
                # be encoded back to UTF-8 bytes before it is written.
                file_object.write(text.encode('utf-8') + '\n')
        return len(results)
class Tool(object):
    """Turn a fragment of post HTML into plain text.

    The class attributes are compiled patterns that ``replace`` applies in a
    fixed order; tags that matter for layout become newlines or tabs and
    every remaining tag is dropped.
    """

    # <img> tags and runs of exactly seven spaces are deleted.
    removeImg = re.compile(r'<img.*?>| {7}')
    # Hyperlink tags are deleted; the link text is kept.
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Tags that end a line become "\n".
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # A table cell opener becomes "\t".
    replaceTD = re.compile(r'<td>')
    # A paragraph opener becomes "\n" followed by one space.
    replacePara = re.compile(r'<p.*?>')
    # One or two consecutive line breaks become a single "\n"; the
    # self-closing spellings <br/> and <br /> are accepted as well.
    replaceBR = re.compile(r'<br\s*/?><br\s*/?>|<br\s*/?>')
    # Whatever tag is left at the end is deleted.
    removeExtraTag = re.compile(r'<.*?>')

    def replace(self, x):
        """Return ``x`` with its HTML tags converted or removed.

        x -- HTML fragment (string).

        The substitutions run in the order listed on the class, so the
        specific tags are handled before the catch-all tag removal. Leading
        and trailing whitespace is stripped from the result.
        """
        x = re.sub(self.removeImg, "", x)
        x = re.sub(self.removeAddr, "", x)
        x = re.sub(self.replaceLine, "\n", x)
        x = re.sub(self.replaceTD, "\t", x)
        x = re.sub(self.replacePara, "\n ", x)
        x = re.sub(self.replaceBR, "\n", x)
        x = re.sub(self.removeExtraTag, "", x)
        return x.strip()
# Demo run: fetch the sample thread with see_lz=1 (original poster only)
# and save the cleaned posts of its first page to bdtb.txt.
baseURL = 'https://tieba.baidu.com/p/3138733512'
bdtb = BDTB(baseUrl=baseURL, seeLz=1)
# Other entry points, kept for manual experiments:
#   bdtb.getPage(1)
#   bdtb.getTitle()
#   bdtb.getPageNum()
bdtb.getContent()