Python读取贴吧信息

最新推荐文章于 2021-06-10 22:36:26 发布

江南大富翁

最新推荐文章于 2021-06-10 22:36:26 发布

阅读量499

点赞数

下面的代码是参照书上的示例逐行手打的：

import requests
from bs4 import BeautifulSoup as BS
from mylog import MyLog as mylog

class Item(object):
    """Plain record holding the fields scraped for one forum thread.

    Every field is declared at class level with a None default, so reading
    a field that was never filled in yields None instead of raising
    AttributeError.
    """
    title = None  # thread title
    firstAuthor = None  # thread creator
    firstTime = None  # thread creation time
    reNum = None  # total number of replies
    content = None  # content of the last reply (the name the scraper reads and writes)
    contest = None  # misspelling of `content`; kept so existing references still resolve
    lastAuthor = None  # author of the last reply
    lastTime = None  # time of the last reply

class GetPostBarInfo():
    """Scrape thread summaries from forum list pages and save them to a text file.

    Constructing an instance runs the whole job: build the page URLs,
    download and parse each page, then write every parsed item to disk.
    """

    # Seconds to wait for a page before giving up; without a timeout
    # requests.get() can block indefinitely on a stalled connection.
    requestTimeout = 10
    # Output file used by pipelines() when no filename is passed.
    defaultFilename = '贴吧_权利的游戏.txt'

    def __init__(self, url):
        """Run the scrape starting from the list-page address *url*.

        The text after the last '=' in *url* is treated as the paging offset
        and is replaced for each page (see getUrls).
        """
        self.url = url
        self.log = mylog()
        self.pageSum = 5
        self.urls = self.getUrls(self.pageSum)
        self.items = self.spider(self.urls)
        self.pipelines(self.items)

    def getUrls(self, pageSum):
        """Return the URLs of the first *pageSum* list pages.

        Page i gets offset i * 50, substituted for whatever follows the
        last '=' in self.url.
        """
        prefix = self.url.split('=')[:-1]
        return ['='.join(prefix + [str(page * 50)]) for page in range(pageSum)]

    def spider(self, urls):
        """Download every URL in *urls* and return the list of parsed Item objects.

        Pages that could not be downloaded are skipped, so one failed
        request does not abort the remaining pages.
        """
        items = []
        for url in urls:
            htmlContent = self.getResponseContent(url)
            if htmlContent is None:
                # The failure was already logged by getResponseContent().
                continue
            soup = BS(htmlContent, 'lxml')
            # NOTE(review): the class value is matched as a literal string that
            # includes the leading space; whether it matches may depend on how
            # the installed bs4 version splits class attributes - confirm.
            tagsli = soup.find_all('li', attrs={'class': ' j_thread_list clearfix'})
            for tag in tagsli:
                item = self._parseThread(tag)
                items.append(item)
                self.log.info(u'获取标题为《%s》的项成功...' % item.title)

        return items

    def _parseThread(self, tag):
        """Build an Item from one thread <li> element; absent fields stay None."""
        item = Item()
        item.title = self._textOf(tag.find('a', attrs={'class': 'j_th_tit'}))
        item.firstAuthor = self._textOf(
            tag.find('span', attrs={'class': 'frs-author-name-wrap'}), useLink=True)
        item.firstTime = self._textOf(tag.find('span', attrs={'title': '创建时间'}))
        # Stored as reNum (not regNum) because that is the attribute
        # pipelines() writes out.
        item.reNum = self._textOf(tag.find('span', attrs={'title': '回复'}))
        item.content = self._textOf(
            tag.find('div', attrs={'class': 'threadlist_abs threadlist_abs_onlyline '}))
        item.lastAuthor = self._textOf(
            tag.find('span', attrs={'class': 'tb_icon_author_rely j_replyer'}), useLink=True)
        item.lastTime = self._textOf(tag.find('span', attrs={'title': '最后回复时间'}))
        return item

    @staticmethod
    def _textOf(node, useLink=False):
        """Return the stripped text of *node*, or None when the element is absent.

        With useLink=True the text is taken from the first <a> inside *node*.
        """
        if node is not None and useLink:
            node = node.a
        if node is None:
            return None
        return node.get_text().strip()

    def pipelines(self, items, filename=None):
        """Write every item in *items* to a UTF-8 text file.

        filename: output path; defaults to defaultFilename. An existing
            file is overwritten.
        """
        if filename is None:
            filename = self.defaultFilename
        with open(filename, 'w', encoding='utf-8') as fp:
            for item in items:
                fp.write(
                    'title:{} \t author:{} \t firstTime:{} \ncontent:{} \n return:{} \n lastAuthor:{} \t lastTime:{} \n\n\n\n'
                        .format(item.title, item.firstAuthor, item.firstTime, item.content, item.reNum, item.lastAuthor,
                                item.lastTime))

    def getResponseContent(self, url):
        """Return the body text of *url*, or None if the request failed.

        Kept as a separate method so that proxy and headers support can be
        added later in one place.
        """
        try:
            response = requests.get(url, timeout=self.requestTimeout)
        except requests.RequestException:
            self.log.error('python 返回URL:%s 数据失败' % url)
            return None
        self.log.info('python 返回URLs:%s 数据成功' % url)
        return response.text
if __name__ == '__main__':
    # Script entry point: scrape the forum list page below.
    # The paging offset (pn) must stay the last query parameter, because
    # getUrls() replaces everything after the final '=' for each page.
    url = (
        'http://tieba.baidu.com/f'
        '?kw=%E6%9D%83%E5%8A%9B%E7%9A%84%E6%B8%B8%E6%88%8F'
        '&ie=utf-8'
        '&pn=50'
    )
    # Constructing the object performs the whole scrape and writes the output file.
    GTI = GetPostBarInfo(url)