# Typed in by hand, line by line, following the code from the book.
import requests
from bs4 import BeautifulSoup as BS
from mylog import MyLog as mylog
class Item(object):
    """Plain record holding the fields collected for one forum thread.

    Every field defaults to ``None`` until it is assigned.
    """

    title = None  # thread title
    firstAuthor = None  # thread creator
    firstTime = None  # thread creation time
    reNum = None  # total number of replies
    content = None  # content of the last reply
    contest = None  # misspelled legacy name for ``content``; kept so old references still resolve
    lastAuthor = None  # author of the last reply
    lastTime = None  # time of the last reply
class GetPostBarInfo():
    """Scrape thread listings from a Tieba forum and save them to a text file.

    Constructing an instance runs the whole job: it builds the page URLs,
    scrapes every page and writes the collected items to disk.
    """

    # Seconds to wait for the server before a request is abandoned; without
    # a timeout ``requests.get`` can block indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self, url, pageSum=5):
        """Run the crawl for the forum listing at *url*.

        Args:
            url: Listing URL whose text after the last ``=`` is the page
                offset (e.g. ``...&pn=50``).
            pageSum: Number of listing pages to fetch.
        """
        self.url = url
        self.log = mylog()
        self.pageSum = pageSum
        self.urls = self.getUrls(self.pageSum)
        self.items = self.spider(self.urls)
        self.pipelines(self.items)

    def getUrls(self, pageSum):
        """Return the listing URLs for the first *pageSum* pages.

        The text after the last ``=`` in ``self.url`` is replaced by the
        offsets 0, 50, 100, ... (one per page).
        """
        prefix = self.url.split('=')[:-1]
        return ['='.join(prefix + [str(page * 50)]) for page in range(pageSum)]

    @staticmethod
    def _text(node):
        """Return the stripped text of *node*, or ``None`` when *node* is missing."""
        if node is None:
            return None
        return node.get_text().strip()

    @staticmethod
    def _link(node):
        """Return the first ``<a>`` inside *node*, or ``None`` when *node* is missing."""
        if node is None:
            return None
        return node.a

    def spider(self, urls):
        """Fetch every URL in *urls* and return the parsed ``Item`` objects.

        Pages that could not be downloaded are skipped. A field whose tag is
        absent from a thread entry keeps its ``None`` default instead of
        aborting the whole crawl.
        """
        items = []
        for url in urls:
            htmlContent = self.getResponseContent(url)
            if htmlContent is None:
                # Download failed (already logged); move on to the next page.
                continue
            soup = BS(htmlContent, 'lxml')
            tagsli = soup.find_all('li', attrs={'class': ' j_thread_list clearfix'})
            for tag in tagsli:
                item = Item()
                item.title = self._text(tag.find('a', attrs={'class': 'j_th_tit'}))
                item.firstAuthor = self._text(
                    self._link(tag.find('span', attrs={'class': 'frs-author-name-wrap'})))
                item.firstTime = self._text(tag.find('span', attrs={'title': '创建时间'}))
                # Stored as ``reNum`` because that is the attribute pipelines() writes out.
                item.reNum = self._text(tag.find('span', attrs={'title': '回复'}))
                item.content = self._text(
                    tag.find('div', attrs={'class': 'threadlist_abs threadlist_abs_onlyline '}))
                item.lastAuthor = self._text(
                    self._link(tag.find('span', attrs={'class': 'tb_icon_author_rely j_replyer'})))
                item.lastTime = self._text(tag.find('span', attrs={'title': '最后回复时间'}))
                items.append(item)
                self.log.info(u'获取标题为《%s》的项成功...' % item.title)
        return items

    def pipelines(self, items, filename='贴吧_权利的游戏.txt'):
        """Write *items* to *filename* as UTF-8 text, one record per thread.

        Args:
            items: Iterable of objects exposing the ``Item`` attributes.
            filename: Output path; any existing file is overwritten.
        """
        with open(filename, 'w', encoding='utf-8') as fp:
            for item in items:
                fp.write(
                    'title:{} \t author:{} \t firstTime:{} \ncontent:{} \n return:{} \n lastAuthor:{} \t lastTime:{} \n\n\n\n'
                    .format(item.title, item.firstAuthor, item.firstTime, item.content, item.reNum,
                            item.lastAuthor, item.lastTime))

    def getResponseContent(self, url):
        """Download *url* and return the response body as text, or ``None`` on failure.

        Kept as a separate method so that proxies, headers, etc. can be
        added later in one place.
        """
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException:
            # Only network/HTTP-layer errors are handled here, so Ctrl-C and
            # programming errors are no longer swallowed.
            self.log.error('python 返回URL:%s 数据失败' % url)
            return None
        self.log.info('python 返回URLs:%s 数据成功' % url)
        return response.text
if __name__ == '__main__':
    # Script entry point: constructing GetPostBarInfo performs the whole
    # crawl (build page URLs, scrape them, write the result file).
    url = (
        'http://tieba.baidu.com/f'
        '?kw=%E6%9D%83%E5%8A%9B%E7%9A%84%E6%B8%B8%E6%88%8F'
        '&ie=utf-8&pn=50'
    )
    GTI = GetPostBarInfo(url)