《2018年6月29日》【连续261天】
标题:爬取百度贴吧帖子;
内容:
本来想跟别人一样用 urllib.request.Request 爬取 html,但一运行就未响应,只好改用 requests 了。
url: https://tieba.baidu.com/p/3138733512?see_lz=1&pn=1&red_tag=1250477225
标题:

总页数:

发言:

看了别人的一些代码,加了一个剔除 HTML 标签(即页面里的"乱码")的工具类:
import urllib
import requests
import re
class Tool:
    """Convert the HTML of a post body into plain text.

    Each class attribute is a compiled pattern; ``replace`` applies them in a
    fixed order, with the catch-all tag remover last so that the tags which
    carry layout meaning are turned into whitespace first.
    """

    # <img> tags and every run of seven consecutive spaces are dropped.
    # (No trailing "|": an empty alternative would match at every position.)
    removeImg = re.compile(r'<img.*?>| {7}')
    # Hyperlink tags are dropped; the link text between them is kept.
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Tags that end a line become "\n".
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # Table cells become a tab.
    replaceTD = re.compile(r'<td>')
    # A paragraph opening becomes a newline followed by a space.
    replacePara = re.compile(r'<p.*?>')
    # A single or doubled <br> becomes one "\n".
    replaceBR = re.compile(r'<br><br>|<br>')
    # Whatever tags remain are removed.
    removeExtraTag = re.compile(r'<.*?>')

    def replace(self, x):
        """Return ``x`` with HTML tags removed or converted to whitespace.

        Args:
            x: HTML fragment as a string.

        Returns:
            The cleaned text with leading and trailing whitespace stripped.
        """
        x = self.removeImg.sub("", x)
        x = self.removeAddr.sub("", x)
        x = self.replaceLine.sub("\n", x)
        x = self.replaceTD.sub("\t", x)
        x = self.replacePara.sub("\n ", x)
        x = self.replaceBR.sub("\n", x)
        x = self.removeExtraTag.sub("", x)
        # strip() drops the whitespace the substitutions left at both ends.
        return x.strip()
class BDTB:
    """Download one Baidu Tieba thread and print its posts.

    Attributes:
        baseURL: Thread URL without a query string.
        seeLZ: Query-string prefix ``"?see_lz=<value>"`` built from the
            constructor argument (presumably 1 limits the thread to the
            original poster's posts -- confirm against the site).
        tool: ``Tool`` instance used to clean the HTML of each post.
    """

    # Compiled once at class creation instead of on every method call.
    _titlePattern = re.compile(
        r'<h3 class="core_title_txt pull-left text-overflow.*?>(.*?)</h3>', re.S)
    _pageNumPattern = re.compile(
        r'<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>', re.S)
    _contentPattern = re.compile(r'<div id="post_content_.*?>(.*?)</div>', re.S)

    def __init__(self, baseUrl, seeLZ):
        self.baseURL = baseUrl
        self.seeLZ = '?see_lz=' + str(seeLZ)
        self.tool = Tool()

    def getPage(self, pageNum):
        """Fetch page ``pageNum`` of the thread.

        Returns:
            The decoded HTML, or ``""`` when the request fails (connection
            error, timeout, or HTTP error status).
        """
        url = self.baseURL + self.seeLZ + '&pn=' + str(pageNum)
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()
        except requests.RequestException:
            # Best effort: callers treat an empty page as "nothing found".
            return ""
        # Use the encoding detected from the body rather than the header.
        r.encoding = r.apparent_encoding
        return r.text

    def getTitle(self):
        """Return the thread title taken from page 1, or None if not found."""
        result = self._titlePattern.search(self.getPage(1))
        if result is None:
            return None
        return result.group(1).strip()

    def getPageNum(self):
        """Return the page count shown on page 1 as a string, or None."""
        result = self._pageNumPattern.search(self.getPage(1))
        if result is None:
            return None
        return result.group(1).strip()

    def getContent(self, page):
        """Print every post found in ``page``, numbered from 1.

        Args:
            page: HTML of one thread page, as returned by ``getPage``.
        """
        items = self._contentPattern.findall(page)
        for floor, item in enumerate(items, 1):
            print(floor, u"楼------------------------------------------------------------\n")
            print(self.tool.replace(item))
def main():
    """Crawl one hard-coded thread: print its title, then the posts of every page."""
    baseURL = "http://tieba.baidu.com/p/3138733512"
    bdtb = BDTB(baseURL, 1)
    n = bdtb.getPageNum()
    title = bdtb.getTitle()
    print(title)
    # getPageNum() returns None when page 1 could not be fetched or parsed;
    # int(None) would otherwise abort with a TypeError.
    try:
        pageCount = int(n)
    except (TypeError, ValueError):
        print("Could not determine the page count; nothing to crawl.")
        return
    # Thread pages are numbered from 1.
    for i in range(1, pageCount + 1):
        bdtb.getContent(bdtb.getPage(i))
main()

将 baseURL 改掉,即可爬取其他帖子,也可以将内容保存到文件中;
效果:
1976

被折叠的 条评论
为什么被折叠?



